Load the packages: tidyverse for data wrangling, gridExtra for arranging multiple plots, ggplot2 for plotting, and plotly for turning the ggplot into an interactive plot.
library("tidyverse")
## ── Attaching packages ─────────────────────────────────────────────────────────────────────────────────────────── tidyverse 1.3.0 ──
## ✓ ggplot2 3.2.1 ✓ purrr 0.3.3
## ✓ tibble 2.1.3 ✓ dplyr 0.8.4
## ✓ tidyr 1.0.2 ✓ stringr 1.4.0
## ✓ readr 1.3.1 ✓ forcats 0.4.0
## ── Conflicts ────────────────────────────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library("gridExtra")
##
## Attaching package: 'gridExtra'
## The following object is masked from 'package:dplyr':
##
## combine
library("ggplot2")
library("plotly")
##
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
Load the data for each year. Most of the data processing here would ideally be done with a for-loop over a list of data frames, but I wasn’t completely sure how to do that properly, so each year’s dataset is processed separately, one at a time.
# Load one World Happiness CSV per year (2015-2019).
# The directory is named once so the location only has to be changed in a
# single place; file.path() joins the pieces with "/", which yields exactly
# the same paths as the previously repeated literals.
data_dir <- "/Users/lubis/Documents/study/dataanalysis/hackathon/world-happiness"
df_2015 <- read.csv(file.path(data_dir, "2015.csv"), header = TRUE)
df_2016 <- read.csv(file.path(data_dir, "2016.csv"), header = TRUE)
df_2017 <- read.csv(file.path(data_dir, "2017.csv"), header = TRUE)
df_2018 <- read.csv(file.path(data_dir, "2018.csv"), header = TRUE)
df_2019 <- read.csv(file.path(data_dir, "2019.csv"), header = TRUE)
Inspect the structure of each year's data to find the features that are common to all years; only those will be kept.
# Print the column names, types and leading values of the raw 2015 data.
str(df_2015)
## 'data.frame': 158 obs. of 12 variables:
## $ Country : Factor w/ 158 levels "Afghanistan",..: 136 59 38 106 25 46 100 135 101 7 ...
## $ Region : Factor w/ 10 levels "Australia and New Zealand",..: 10 10 10 10 6 10 10 10 1 1 ...
## $ Happiness.Rank : int 1 2 3 4 5 6 7 8 9 10 ...
## $ Happiness.Score : num 7.59 7.56 7.53 7.52 7.43 ...
## $ Standard.Error : num 0.0341 0.0488 0.0333 0.0388 0.0355 ...
## $ Economy..GDP.per.Capita. : num 1.4 1.3 1.33 1.46 1.33 ...
## $ Family : num 1.35 1.4 1.36 1.33 1.32 ...
## $ Health..Life.Expectancy. : num 0.941 0.948 0.875 0.885 0.906 ...
## $ Freedom : num 0.666 0.629 0.649 0.67 0.633 ...
## $ Trust..Government.Corruption.: num 0.42 0.141 0.484 0.365 0.33 ...
## $ Generosity : num 0.297 0.436 0.341 0.347 0.458 ...
## $ Dystopia.Residual : num 2.52 2.7 2.49 2.47 2.45 ...
# Print the column names, types and leading values of the raw 2016 data.
str(df_2016)
## 'data.frame': 157 obs. of 13 variables:
## $ Country : Factor w/ 157 levels "Afghanistan",..: 38 135 58 104 45 26 98 99 7 134 ...
## $ Region : Factor w/ 10 levels "Australia and New Zealand",..: 10 10 10 10 10 6 10 1 1 10 ...
## $ Happiness.Rank : int 1 2 3 4 5 6 7 8 9 10 ...
## $ Happiness.Score : num 7.53 7.51 7.5 7.5 7.41 ...
## $ Lower.Confidence.Interval : num 7.46 7.43 7.33 7.42 7.35 ...
## $ Upper.Confidence.Interval : num 7.59 7.59 7.67 7.58 7.47 ...
## $ Economy..GDP.per.Capita. : num 1.44 1.53 1.43 1.58 1.41 ...
## $ Family : num 1.16 1.15 1.18 1.13 1.13 ...
## $ Health..Life.Expectancy. : num 0.795 0.863 0.867 0.796 0.811 ...
## $ Freedom : num 0.579 0.586 0.566 0.596 0.571 ...
## $ Trust..Government.Corruption.: num 0.445 0.412 0.15 0.358 0.41 ...
## $ Generosity : num 0.362 0.281 0.477 0.379 0.255 ...
## $ Dystopia.Residual : num 2.74 2.69 2.83 2.66 2.83 ...
# Print the column names, types and leading values of the raw 2017 data.
str(df_2017)
## 'data.frame': 155 obs. of 12 variables:
## $ Country : Factor w/ 155 levels "Afghanistan",..: 105 38 58 133 45 99 26 100 132 7 ...
## $ Happiness.Rank : int 1 2 3 4 5 6 7 8 9 10 ...
## $ Happiness.Score : num 7.54 7.52 7.5 7.49 7.47 ...
## $ Whisker.high : num 7.59 7.58 7.62 7.56 7.53 ...
## $ Whisker.low : num 7.48 7.46 7.39 7.43 7.41 ...
## $ Economy..GDP.per.Capita. : num 1.62 1.48 1.48 1.56 1.44 ...
## $ Family : num 1.53 1.55 1.61 1.52 1.54 ...
## $ Health..Life.Expectancy. : num 0.797 0.793 0.834 0.858 0.809 ...
## $ Freedom : num 0.635 0.626 0.627 0.62 0.618 ...
## $ Generosity : num 0.362 0.355 0.476 0.291 0.245 ...
## $ Trust..Government.Corruption.: num 0.316 0.401 0.154 0.367 0.383 ...
## $ Dystopia.Residual : num 2.28 2.31 2.32 2.28 2.43 ...
# Print the column names, types and leading values of the raw 2018 data.
str(df_2018)
## 'data.frame': 156 obs. of 9 variables:
## $ Overall.rank : int 1 2 3 4 5 6 7 8 9 10 ...
## $ Country.or.region : Factor w/ 156 levels "Afghanistan",..: 45 106 38 58 134 100 26 101 133 7 ...
## $ Score : num 7.63 7.59 7.55 7.5 7.49 ...
## $ GDP.per.capita : num 1.3 1.46 1.35 1.34 1.42 ...
## $ Social.support : num 1.59 1.58 1.59 1.64 1.55 ...
## $ Healthy.life.expectancy : num 0.874 0.861 0.868 0.914 0.927 0.878 0.896 0.876 0.913 0.91 ...
## $ Freedom.to.make.life.choices: num 0.681 0.686 0.683 0.677 0.66 0.638 0.653 0.669 0.659 0.647 ...
## $ Generosity : num 0.202 0.286 0.284 0.353 0.256 0.333 0.321 0.365 0.285 0.361 ...
## $ Perceptions.of.corruption : Factor w/ 111 levels "0.000","0.001",..: 107 103 108 78 104 99 98 106 105 100 ...
# Print the column names, types and leading values of the raw 2019 data.
str(df_2019)
## 'data.frame': 156 obs. of 9 variables:
## $ Overall.rank : int 1 2 3 4 5 6 7 8 9 10 ...
## $ Country.or.region : Factor w/ 156 levels "Afghanistan",..: 44 37 106 58 99 134 133 100 24 7 ...
## $ Score : num 7.77 7.6 7.55 7.49 7.49 ...
## $ GDP.per.capita : num 1.34 1.38 1.49 1.38 1.4 ...
## $ Social.support : num 1.59 1.57 1.58 1.62 1.52 ...
## $ Healthy.life.expectancy : num 0.986 0.996 1.028 1.026 0.999 ...
## $ Freedom.to.make.life.choices: num 0.596 0.592 0.603 0.591 0.557 0.572 0.574 0.585 0.584 0.532 ...
## $ Generosity : num 0.153 0.252 0.271 0.354 0.322 0.263 0.267 0.33 0.285 0.244 ...
## $ Perceptions.of.corruption : num 0.393 0.41 0.341 0.118 0.298 0.343 0.373 0.38 0.308 0.226 ...
Since the feature names change between years, we specify which features to work with and rename them so that every year uses the same names: 1. Country.or.region is renamed to Country 2. Happiness.Rank is renamed to Overall.rank 3. Happiness.Score is renamed to Score 4. Economy..GDP.per.Capita. is renamed to GDP.per.capita 5. Health..Life.Expectancy. is renamed to Healthy.life.expectancy 6. Freedom.to.make.life.choices is renamed to Freedom 7. Trust..Government.Corruption. is renamed to Perceptions.of.corruption
For 2015, aside from renaming some features, 3 features (Standard.Error, Family, Dystopia.Residual) are removed.
# 2015: drop Standard.Error, Family and Dystopia.Residual, then give the
# remaining columns the names used by the 2018/2019 files.
df_2015_rename <- df_2015 %>%
  select(-Standard.Error, -Family, -Dystopia.Residual) %>%
  rename(
    Overall.rank = Happiness.Rank,
    Score = Happiness.Score,
    GDP.per.capita = Economy..GDP.per.Capita.,
    Healthy.life.expectancy = Health..Life.Expectancy.,
    Perceptions.of.corruption = Trust..Government.Corruption.
  )
# Print the structure of the harmonised 2015 data to verify the renaming.
str(df_2015_rename)
## 'data.frame': 158 obs. of 9 variables:
## $ Country : Factor w/ 158 levels "Afghanistan",..: 136 59 38 106 25 46 100 135 101 7 ...
## $ Region : Factor w/ 10 levels "Australia and New Zealand",..: 10 10 10 10 6 10 10 10 1 1 ...
## $ Overall.rank : int 1 2 3 4 5 6 7 8 9 10 ...
## $ Score : num 7.59 7.56 7.53 7.52 7.43 ...
## $ GDP.per.capita : num 1.4 1.3 1.33 1.46 1.33 ...
## $ Healthy.life.expectancy : num 0.941 0.948 0.875 0.885 0.906 ...
## $ Freedom : num 0.666 0.629 0.649 0.67 0.633 ...
## $ Perceptions.of.corruption: num 0.42 0.141 0.484 0.365 0.33 ...
## $ Generosity : num 0.297 0.436 0.341 0.347 0.458 ...
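2016 is treated the same way as 2015, except that Lower.Confidence.Interval and Upper.Confidence.Interval are removed in place of Standard.Error.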
# 2016: drop the confidence-interval columns, Family and Dystopia.Residual,
# then give the remaining columns the names used by the 2018/2019 files.
df_2016_rename <- df_2016 %>%
  select(
    -Lower.Confidence.Interval,
    -Upper.Confidence.Interval,
    -Family,
    -Dystopia.Residual
  ) %>%
  rename(
    Overall.rank = Happiness.Rank,
    Score = Happiness.Score,
    GDP.per.capita = Economy..GDP.per.Capita.,
    Healthy.life.expectancy = Health..Life.Expectancy.,
    Perceptions.of.corruption = Trust..Government.Corruption.
  )
# Print the structure of the harmonised 2016 data to verify the renaming.
str(df_2016_rename)
## 'data.frame': 157 obs. of 9 variables:
## $ Country : Factor w/ 157 levels "Afghanistan",..: 38 135 58 104 45 26 98 99 7 134 ...
## $ Region : Factor w/ 10 levels "Australia and New Zealand",..: 10 10 10 10 10 6 10 1 1 10 ...
## $ Overall.rank : int 1 2 3 4 5 6 7 8 9 10 ...
## $ Score : num 7.53 7.51 7.5 7.5 7.41 ...
## $ GDP.per.capita : num 1.44 1.53 1.43 1.58 1.41 ...
## $ Healthy.life.expectancy : num 0.795 0.863 0.867 0.796 0.811 ...
## $ Freedom : num 0.579 0.586 0.566 0.596 0.571 ...
## $ Perceptions.of.corruption: num 0.445 0.412 0.15 0.358 0.41 ...
## $ Generosity : num 0.362 0.281 0.477 0.379 0.255 ...
For 2017, 4 features (Whisker.high, Whisker.low, Family, Dystopia.Residual) are removed.
# 2017: drop the whisker columns, Family and Dystopia.Residual, then give
# the remaining columns the names used by the 2018/2019 files.
df_2017_rename <- df_2017 %>%
  select(-Whisker.high, -Whisker.low, -Family, -Dystopia.Residual) %>%
  rename(
    Overall.rank = Happiness.Rank,
    Score = Happiness.Score,
    GDP.per.capita = Economy..GDP.per.Capita.,
    Healthy.life.expectancy = Health..Life.Expectancy.,
    Perceptions.of.corruption = Trust..Government.Corruption.
  )
# Print the structure of the harmonised 2017 data to verify the renaming.
str(df_2017_rename)
## 'data.frame': 155 obs. of 8 variables:
## $ Country : Factor w/ 155 levels "Afghanistan",..: 105 38 58 133 45 99 26 100 132 7 ...
## $ Overall.rank : int 1 2 3 4 5 6 7 8 9 10 ...
## $ Score : num 7.54 7.52 7.5 7.49 7.47 ...
## $ GDP.per.capita : num 1.62 1.48 1.48 1.56 1.44 ...
## $ Healthy.life.expectancy : num 0.797 0.793 0.834 0.858 0.809 ...
## $ Freedom : num 0.635 0.626 0.627 0.62 0.618 ...
## $ Generosity : num 0.362 0.355 0.476 0.291 0.245 ...
## $ Perceptions.of.corruption: num 0.316 0.401 0.154 0.367 0.383 ...
2018 and 2019 go through a similar process: some features are renamed, and 1 feature (Social.support) is removed from the data frame.
# 2018: align the column names with the earlier years and drop
# Social.support.
# read.csv() parsed Perceptions.of.corruption as a factor for this year
# (presumably because of at least one non-numeric entry in the CSV -- TODO
# confirm), while every other year has it as numeric. Convert it back to
# numbers. Going through as.character() is required: as.numeric() applied
# directly to a factor returns the internal level codes, not the values.
# Entries that are not valid numbers become NA (with a coercion warning).
df_2018_rename <- df_2018 %>%
  rename(
    Country = Country.or.region,
    Freedom = Freedom.to.make.life.choices
  ) %>%
  select(-c(Social.support)) %>%
  mutate(
    Perceptions.of.corruption =
      as.numeric(as.character(Perceptions.of.corruption))
  )
# 2019: drop Social.support and align the column names with the earlier
# years.
df_2019_rename <- df_2019 %>%
  select(-Social.support) %>%
  rename(
    Country = Country.or.region,
    Freedom = Freedom.to.make.life.choices
  )
# Print the structure of the harmonised 2018 data.
str(df_2018_rename)
## 'data.frame': 156 obs. of 8 variables:
## $ Overall.rank : int 1 2 3 4 5 6 7 8 9 10 ...
## $ Country : Factor w/ 156 levels "Afghanistan",..: 45 106 38 58 134 100 26 101 133 7 ...
## $ Score : num 7.63 7.59 7.55 7.5 7.49 ...
## $ GDP.per.capita : num 1.3 1.46 1.35 1.34 1.42 ...
## $ Healthy.life.expectancy : num 0.874 0.861 0.868 0.914 0.927 0.878 0.896 0.876 0.913 0.91 ...
## $ Freedom : num 0.681 0.686 0.683 0.677 0.66 0.638 0.653 0.669 0.659 0.647 ...
## $ Generosity : num 0.202 0.286 0.284 0.353 0.256 0.333 0.321 0.365 0.285 0.361 ...
## $ Perceptions.of.corruption: Factor w/ 111 levels "0.000","0.001",..: 107 103 108 78 104 99 98 106 105 100 ...
# Print the structure of the harmonised 2019 data.
str(df_2019_rename)
## 'data.frame': 156 obs. of 8 variables:
## $ Overall.rank : int 1 2 3 4 5 6 7 8 9 10 ...
## $ Country : Factor w/ 156 levels "Afghanistan",..: 44 37 106 58 99 134 133 100 24 7 ...
## $ Score : num 7.77 7.6 7.55 7.49 7.49 ...
## $ GDP.per.capita : num 1.34 1.38 1.49 1.38 1.4 ...
## $ Healthy.life.expectancy : num 0.986 0.996 1.028 1.026 0.999 ...
## $ Freedom : num 0.596 0.592 0.603 0.591 0.557 0.572 0.574 0.585 0.584 0.532 ...
## $ Generosity : num 0.153 0.252 0.271 0.354 0.322 0.263 0.267 0.33 0.285 0.244 ...
## $ Perceptions.of.corruption: num 0.393 0.41 0.341 0.118 0.298 0.343 0.373 0.38 0.308 0.226 ...
All of these renamed DataFrames are put in new variables with "_rename" attached to their name.
| TAKE REGION AND PLUG TO THE REMAINING YEARS |
| The feature Region, which denotes the region of each country (North America, Western Europe, etc.), is missing from 2017 onward, so it needs to be added to the later datasets (2017, 2018, 2019). |
# Country -> Region lookup, used to add Region to the years that lack it.
# Built from both files that carry a Region column (2015 and 2016): the 2016
# file contains countries that are absent from 2015, and with a 2015-only
# lookup those countries would end up with Region = NA in later years.
# Both columns are converted to character first so that stacking the two
# years does not depend on their factor levels matching.
region_2015 <- df_2015 %>%
  select(Country, Region) %>%
  mutate(Country = as.character(Country), Region = as.character(Region))
region_2016 <- df_2016 %>%
  select(Country, Region) %>%
  mutate(Country = as.character(Country), Region = as.character(Region))
# 2015 is listed first, so its Region wins for countries present in both
# years; distinct() keeps the first row per Country, which also guarantees
# one row per key and therefore no row duplication in the joins.
plug_region <- bind_rows(region_2015, region_2016) %>%
  distinct(Country, .keep_all = TRUE)
# Attach Region to the 2017 rows. No `by` is given, so dplyr joins on the
# columns the two tables share (Country, per the message printed below).
df_2017_rename <- df_2017_rename %>% left_join(plug_region)
## Joining, by = "Country"
## Warning: Column `Country` joining factors with different levels, coercing to
## character vector
# Attach Region to the 2018 rows. No `by` is given, so dplyr joins on the
# columns the two tables share (Country, per the message printed below).
df_2018_rename <- df_2018_rename %>% left_join(plug_region)
## Joining, by = "Country"
## Warning: Column `Country` joining factors with different levels, coercing to
## character vector
# Attach Region to the 2019 rows. No `by` is given, so dplyr joins on the
# columns the two tables share (Country, per the message printed below).
df_2019_rename <- df_2019_rename %>% left_join(plug_region)
## Joining, by = "Country"
## Warning: Column `Country` joining factors with different levels, coercing to
## character vector
Add a new feature, year, to each dataset.
# Tag every yearly data frame with the year it describes; the single value
# is recycled to all rows.
df_2015_rename[["year"]] <- 2015
df_2016_rename[["year"]] <- 2016
df_2017_rename[["year"]] <- 2017
df_2018_rename[["year"]] <- 2018
df_2019_rename[["year"]] <- 2019
bind_rows() simply stacks the yearly data frames on top of each other by rows; the year column distinguishes which year each row came from.
# Stack all five yearly data frames into one long data frame; the `year`
# column tells the rows apart.
# 2018 was previously left out of this call, so it never reached the plot.
# Its Perceptions.of.corruption column was read as a factor (see
# str(df_2018)), which bind_rows() will not combine with the numeric column
# of the other years. Convert a copy here if that has not been done yet.
# as.character() comes first because as.numeric() on a factor would return
# level codes; non-numeric entries become NA (with a coercion warning).
df_2018_bind <- df_2018_rename
if (!is.numeric(df_2018_bind$Perceptions.of.corruption)) {
  df_2018_bind$Perceptions.of.corruption <-
    as.numeric(as.character(df_2018_bind$Perceptions.of.corruption))
}
df <- bind_rows(
  df_2015_rename, df_2016_rename, df_2017_rename,
  df_2018_bind, df_2019_rename
)
## Warning in bind_rows_(x, .id): Unequal factor levels: coercing to character
## Warning in bind_rows_(x, .id): binding character and factor vector, coercing
## into character vector
## Warning in bind_rows_(x, .id): binding character and factor vector, coercing
## into character vector
Then plot using plotly.
# Scatter plot of happiness Score against Healthy.life.expectancy, coloured
# by Region and sized by GDP.per.capita. `frame` and `ids` are not ggplot2
# aesthetics (ggplot2 warns and ignores them); they are passed so that
# ggplotly() can animate the plot by year and follow each country.
# Only one x scale is set: the original added scale_x_log10() and then
# xlim(), and xlim() replaced the log scale, so the axis was linear anyway.
# The upper limit is NA (computed from the data) because a fixed upper
# limit of 1.00 removed points: the 2019 data has values above 1
# (e.g. 1.028). The lower limit of 0.05 is kept as before.
p <- ggplot(df, aes(Healthy.life.expectancy, Score, color = Region)) +
  geom_point(
    alpha = 0.5,
    aes(size = GDP.per.capita, frame = year, ids = Country)
  ) +
  xlim(0.05, NA)
## Warning: Ignoring unknown aesthetics: frame, ids
## Scale for 'x' is already present. Adding another scale for 'x', which will
## replace the existing scale.
# Convert the ggplot object into a plotly widget, then display it by
# evaluating it at top level.
p <- ggplotly(p)
p
## Warning in p$x$data[firstFrame] <- p$x$frames[[1]]$data: number of items to
## replace is not a multiple of replacement length